512 Project

Team: Chenxi Liu, Yuan Liu, Qian Yi, Jingyu Zhang

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tree)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(class)
library (glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
library(corrplot)
## corrplot 0.84 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(tidyverse)
## Registered S3 method overwritten by 'cli':
##   method     from
##   print.tree tree
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble  3.0.1     ✔ purrr   0.3.4
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ tidyr::expand()         masks Matrix::expand()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::lift()           masks caret::lift()
## ✖ randomForest::margin()  masks ggplot2::margin()
## ✖ tidyr::pack()           masks Matrix::pack()
## ✖ tidyr::unpack()         masks Matrix::unpack()
library(cluster) 

EDA

wine=read.csv("wine_white.csv")
summary(wine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.878  
##  3rd Qu.:6.000  
##  Max.   :9.000
head(wine)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.0             0.27        0.36           20.7     0.045
## 2           6.3             0.30        0.34            1.6     0.049
## 3           8.1             0.28        0.40            6.9     0.050
## 4           7.2             0.23        0.32            8.5     0.058
## 5           7.2             0.23        0.32            8.5     0.058
## 6           8.1             0.28        0.40            6.9     0.050
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  45                  170  1.0010 3.00      0.45     8.8
## 2                  14                  132  0.9940 3.30      0.49     9.5
## 3                  30                   97  0.9951 3.26      0.44    10.1
## 4                  47                  186  0.9956 3.19      0.40     9.9
## 5                  47                  186  0.9956 3.19      0.40     9.9
## 6                  30                   97  0.9951 3.26      0.44    10.1
##   quality
## 1       6
## 2       6
## 3       6
## 4       6
## 5       6
## 6       6
pairs(quality~.,data=wine, main="simple scatterplot matrix")

corr=cor(wine)
corrplot(corr, method="circle")

wine$quality =as.factor(wine$quality)
wine %>%
  mutate(quality = as.factor(quality)) %>%
  select(-c(residual.sugar, free.sulfur.dioxide, total.sulfur.dioxide, chlorides)) %>% 
  ggpairs(aes(color = quality, alpha = 0.4),
          columns = 1:7,
          lower = list(continuous = "points"),
          upper = list(continuous = "blank"), 
          axisLabels = "none", switch = "both")

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Loading required package: viridisLite
wine %>% 

  plot_ly(x=~alcohol,y=~volatile.acidity,z= ~sulphates, color=~quality, hoverinfo = 'text', colors = viridis(3),

          text = ~paste('Quality:', quality,

                        '<br>Alcohol:', alcohol,

                        '<br>Volatile Acidity:', volatile.acidity,

                        '<br>sulphates:', sulphates)) %>% 

  add_markers(opacity = 0.8) %>%

  layout(title = "3D Wine Quality",

         annotations=list(yref='paper',xref="paper",y=1.05,x=1.1, text="quality",showarrow=F), scene = list(xaxis = list(title = 'Alcohol'),

                      yaxis = list(title = 'Volatile Acidity'),

                      zaxis = list(title = 'sulphates')))
#Distribution of wine quality ratings
ggplot(wine, aes(x = quality)) +
geom_bar(stat = "count",position = "dodge") + ggtitle("Distribution of Wine Quality Ratings") + theme_classic()

Clustering

Data Loading

data = read.csv("wine_white.csv")
head(data)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.0             0.27        0.36           20.7     0.045
## 2           6.3             0.30        0.34            1.6     0.049
## 3           8.1             0.28        0.40            6.9     0.050
## 4           7.2             0.23        0.32            8.5     0.058
## 5           7.2             0.23        0.32            8.5     0.058
## 6           8.1             0.28        0.40            6.9     0.050
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  45                  170  1.0010 3.00      0.45     8.8
## 2                  14                  132  0.9940 3.30      0.49     9.5
## 3                  30                   97  0.9951 3.26      0.44    10.1
## 4                  47                  186  0.9956 3.19      0.40     9.9
## 5                  47                  186  0.9956 3.19      0.40     9.9
## 6                  30                   97  0.9951 3.26      0.44    10.1
##   quality
## 1       6
## 2       6
## 3       6
## 4       6
## 5       6
## 6       6
summary(data)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.878  
##  3rd Qu.:6.000  
##  Max.   :9.000

K-Means Clustering

Elbow Method
df = data.frame(scale(data[1:11])) #scale dataset
df = na.omit(df)
head(df)
##   fixed.acidity volatile.acidity citric.acid residual.sugar   chlorides
## 1     0.1720794      -0.08176155  0.21325843      2.8210611 -0.03535139
## 2    -0.6574340       0.21587359  0.04799622     -0.9446688  0.14773200
## 3     1.4756004       0.01745016  0.54378284      0.1002720  0.19350284
## 4     0.4090832      -0.47860841 -0.11726599      0.4157258  0.55966962
## 5     0.4090832      -0.47860841 -0.11726599      0.4157258  0.55966962
## 6     1.4756004       0.01745016  0.54378284      0.1002720  0.19350284
##   free.sulfur.dioxide total.sulfur.dioxide      density          pH
## 1           0.5698734            0.7444890  2.331273996 -1.24679399
## 2          -1.2528907           -0.1496693 -0.009153237  0.73995309
## 3          -0.3121093           -0.9732363  0.358628185  0.47505348
## 4           0.6874711            1.1209768  0.525801559  0.01147916
## 5           0.6874711            1.1209768  0.525801559  0.01147916
## 6          -0.3121093           -0.9732363  0.358628185  0.47505348
##     sulphates    alcohol
## 1 -0.34914861 -1.3930102
## 2  0.00134171 -0.8241915
## 3 -0.43677119 -0.3366326
## 4 -0.78726151 -0.4991523
## 5 -0.78726151 -0.4991523
## 6 -0.43677119 -0.3366326
summary(df)
##  fixed.acidity      volatile.acidity   citric.acid      residual.sugar   
##  Min.   :-3.61998   Min.   :-1.9668   Min.   :-2.7615   Min.   :-1.1418  
##  1st Qu.:-0.65743   1st Qu.:-0.6770   1st Qu.:-0.5304   1st Qu.:-0.9250  
##  Median :-0.06492   Median :-0.1810   Median :-0.1173   Median :-0.2349  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.52758   3rd Qu.: 0.4143   3rd Qu.: 0.4612   3rd Qu.: 0.6917  
##  Max.   : 8.70422   Max.   : 8.1528   Max.   :10.9553   Max.   :11.7129  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :-1.6831   Min.   :-1.95848    Min.   :-3.0439     
##  1st Qu.:-0.4473   1st Qu.:-0.72370    1st Qu.:-0.7144     
##  Median :-0.1269   Median :-0.07691    Median :-0.1026     
##  Mean   : 0.0000   Mean   : 0.00000    Mean   : 0.0000     
##  3rd Qu.: 0.1935   3rd Qu.: 0.62867    3rd Qu.: 0.6739     
##  Max.   :13.7417   Max.   :14.91679    Max.   : 7.0977     
##     density               pH             sulphates      
##  Min.   :-2.31280   Min.   :-3.10109   Min.   :-2.3645  
##  1st Qu.:-0.77063   1st Qu.:-0.65077   1st Qu.:-0.6996  
##  Median :-0.09608   Median :-0.05475   Median :-0.1739  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.69298   3rd Qu.: 0.60750   3rd Qu.: 0.5271  
##  Max.   :15.02976   Max.   : 4.18365   Max.   : 5.1711  
##     alcohol        
##  Min.   :-2.04309  
##  1st Qu.:-0.82419  
##  Median :-0.09285  
##  Mean   : 0.00000  
##  3rd Qu.: 0.71974  
##  Max.   : 2.99502
wss <- (nrow(df)-1)*sum(apply(df,2,var))
  for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
## Warning: did not converge in 10 iterations
plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within-cluster sum of squares")

Average Silhouette Method
silhouette_score <- function(k){
  km <- kmeans(df, centers = k, nstart=20, iter.max=50)
  ss <- silhouette(km$cluster, dist(df))
  mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)

fviz_nbclust(df, kmeans, method='silhouette')

Gap Statistic Method
# compute gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50, iter.max=50)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 244900)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 244900)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 244900)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25,     iter.max = 50)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##           logW   E.logW      gap      SE.sim
##  [1,] 8.598970 9.886755 1.287785 0.002156290
##  [2,] 8.475417 9.803630 1.328213 0.002382922
##  [3,] 8.429626 9.753553 1.323927 0.002191498
##  [4,] 8.399495 9.717269 1.317774 0.002405187
##  [5,] 8.364682 9.695848 1.331166 0.002272476
##  [6,] 8.338565 9.676000 1.337435 0.002228208
##  [7,] 8.314816 9.658963 1.344147 0.002219907
##  [8,] 8.295053 9.642652 1.347599 0.002360697
##  [9,] 8.274388 9.629123 1.354735 0.002294227
## [10,] 8.260260 9.616375 1.356115 0.002315594
fviz_gap_stat(gap_stat)

K-Means Result
set.seed(2020)
km.out1=kmeans(df,2,nstart=20)
km.out1$tot.withinss
## [1] 42539.96
fviz_cluster(km.out1, data = df)

Hierarchical Clustering

## ramdomly select 5 rows from each quality degree
d3 = data[data$quality == 3, ]
d3 = d3[sample(nrow(d3), 5), ]
d3
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3266           4.2            0.215        0.23            5.1     0.041
## 1932           7.1            0.490        0.22            2.0     0.047
## 3410           6.2            0.230        0.35            0.7     0.051
## 1485           7.5            0.320        0.24            4.6     0.053
## 3088           6.1            0.200        0.34            9.5     0.041
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 3266                64.0                157.0 0.99688 3.42      0.44
## 1932               146.5                307.5 0.99240 3.24      0.37
## 3410                24.0                111.0 0.99160 3.37      0.43
## 1485                 8.0                134.0 0.99580 3.14      0.50
## 3088                38.0                201.0 0.99500 3.14      0.44
##      alcohol quality
## 3266     8.0       3
## 1932    11.0       3
## 3410    11.0       3
## 1485     9.1       3
## 3088    10.1       3
d4 = data[data$quality == 4, ]
d4 = d4[sample(nrow(d4), 5), ]
d4
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1703           6.5             0.27        0.19            4.2     0.046
## 2533           6.7             0.54        0.27            7.1     0.049
## 1060           5.3             0.30        0.20            1.1     0.077
## 907            8.3             0.27        0.45            1.3     0.048
## 3219           6.5             0.29        0.25            2.5     0.142
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 1703                   6                  114 0.99550 3.25      0.35
## 2533                   8                  178 0.99502 3.16      0.38
## 1060                  48                  166 0.99440 3.30      0.54
## 907                    8                   72 0.99440 3.08      0.61
## 3219                   8                  111 0.99270 3.00      0.44
##      alcohol quality
## 1703     8.6       4
## 2533     9.4       4
## 1060     8.7       4
## 907     10.3       4
## 3219     9.9       4
d5 = data[data$quality == 5, ]
d5 = d5[sample(nrow(d5), 5), ]
d5
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 2594           6.3             0.21        0.29           11.7     0.048
## 430            7.1             0.31        0.47           13.6     0.056
## 785            7.2             0.23        0.19           13.7     0.052
## 1145           7.2             0.25        0.32            1.5     0.047
## 2540           6.2             0.27        0.18            1.5     0.028
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 2594                  49                  147 0.99482 3.22      0.38
## 430                   54                  197 0.99780 3.10      0.49
## 785                   47                  197 0.99865 3.12      0.53
## 1145                  27                  132 0.99330 3.26      0.44
## 2540                  20                  111 0.99228 3.41      0.50
##      alcohol quality
## 2594    10.8       5
## 430      9.3       5
## 785      9.0       5
## 1145    10.5       5
## 2540    10.0       5
d6 = data[data$quality == 6, ]
d6 = d6[sample(nrow(d6), 5), ]
d6
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3224           8.2             0.30        0.44           12.4     0.043
## 2218           8.1             0.25        0.38            3.8     0.051
## 3978           6.9             0.27        0.25            7.5     0.030
## 1483           6.7             0.16        0.49            2.4     0.046
## 1920           7.3             0.22        0.50           13.7     0.049
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 3224                  52                  154 0.99452 3.04      0.33
## 2218                  18                  129 0.99280 3.21      0.38
## 3978                  18                  117 0.99116 3.09      0.38
## 1483                  57                  187 0.99520 3.62      0.81
## 1920                  56                  189 0.99940 3.24      0.66
##      alcohol quality
## 3224    12.0       6
## 2218    11.5       6
## 3978    13.0       6
## 1483    10.4       6
## 1920     9.0       6
d7 = data[data$quality == 7, ]
d7 = d7[sample(nrow(d7), 5), ]
d7
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 600            6.9             0.19        0.40           1.40     0.036
## 2556           6.3             0.13        0.42           1.10     0.043
## 3597           6.6             0.17        0.28           1.10     0.034
## 352            7.3             0.33        0.40           6.85     0.038
## 2176           7.4             0.19        0.30          12.80     0.053
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 600                 14.0                   55 0.99090 3.08      0.68
## 2556                63.0                  146 0.99066 3.13      0.72
## 3597                55.0                  108 0.98939 3.00      0.52
## 352                 32.0                  138 0.99200 3.03      0.30
## 2176                48.5                  212 0.99860 3.14      0.49
##      alcohol quality
## 600     11.5       7
## 2556    11.2       7
## 3597    11.9       7
## 352     11.9       7
## 2176     9.1       7
d8 = data[data$quality == 8, ]
d8 = d8[sample(nrow(d8), 5), ]
d8
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 160            5.2             0.44        0.04           1.40     0.036
## 3462           6.7             0.24        0.30           3.85     0.042
## 331            6.4             0.32        0.35           4.80     0.030
## 3071           6.8             0.28        0.43           7.60     0.030
## 1633           7.1             0.28        0.49           6.50     0.041
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 160                   43                  119 0.98940 3.36      0.33
## 3462                 105                  179 0.99189 3.04      0.59
## 331                   34                  101 0.99120 3.36      0.60
## 3071                  30                  110 0.99164 3.08      0.59
## 1633                  28                  111 0.99260 3.41      0.58
##      alcohol quality
## 160     12.1       8
## 3462    11.3       8
## 331     12.5       8
## 3071    12.5       8
## 1633    12.2       8
d9 = data[data$quality == 9, ]
d9 = d9[sample(nrow(d9), 5), ]
d9
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 828            7.4             0.24        0.36            2.0     0.031
## 877            6.9             0.36        0.34            4.2     0.018
## 821            6.6             0.36        0.29            1.6     0.021
## 775            9.1             0.27        0.45           10.6     0.035
## 1606           7.1             0.26        0.49            2.2     0.032
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 828                   27                  139 0.99055 3.28      0.48
## 877                   57                  119 0.98980 3.28      0.36
## 821                   24                   85 0.98965 3.41      0.61
## 775                   28                  124 0.99700 3.20      0.46
## 1606                  31                  113 0.99030 3.37      0.42
##      alcohol quality
## 828     12.5       9
## 877     12.7       9
## 821     12.4       9
## 775     10.4       9
## 1606    12.9       9
new_df = rbind(d3,d4,d5,d6,d7,d8,d9)
new_df = new_df[1:11]
new_df
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3266           4.2            0.215        0.23           5.10     0.041
## 1932           7.1            0.490        0.22           2.00     0.047
## 3410           6.2            0.230        0.35           0.70     0.051
## 1485           7.5            0.320        0.24           4.60     0.053
## 3088           6.1            0.200        0.34           9.50     0.041
## 1703           6.5            0.270        0.19           4.20     0.046
## 2533           6.7            0.540        0.27           7.10     0.049
## 1060           5.3            0.300        0.20           1.10     0.077
## 907            8.3            0.270        0.45           1.30     0.048
## 3219           6.5            0.290        0.25           2.50     0.142
## 2594           6.3            0.210        0.29          11.70     0.048
## 430            7.1            0.310        0.47          13.60     0.056
## 785            7.2            0.230        0.19          13.70     0.052
## 1145           7.2            0.250        0.32           1.50     0.047
## 2540           6.2            0.270        0.18           1.50     0.028
## 3224           8.2            0.300        0.44          12.40     0.043
## 2218           8.1            0.250        0.38           3.80     0.051
## 3978           6.9            0.270        0.25           7.50     0.030
## 1483           6.7            0.160        0.49           2.40     0.046
## 1920           7.3            0.220        0.50          13.70     0.049
## 600            6.9            0.190        0.40           1.40     0.036
## 2556           6.3            0.130        0.42           1.10     0.043
## 3597           6.6            0.170        0.28           1.10     0.034
## 352            7.3            0.330        0.40           6.85     0.038
## 2176           7.4            0.190        0.30          12.80     0.053
## 160            5.2            0.440        0.04           1.40     0.036
## 3462           6.7            0.240        0.30           3.85     0.042
## 331            6.4            0.320        0.35           4.80     0.030
## 3071           6.8            0.280        0.43           7.60     0.030
## 1633           7.1            0.280        0.49           6.50     0.041
## 828            7.4            0.240        0.36           2.00     0.031
## 877            6.9            0.360        0.34           4.20     0.018
## 821            6.6            0.360        0.29           1.60     0.021
## 775            9.1            0.270        0.45          10.60     0.035
## 1606           7.1            0.260        0.49           2.20     0.032
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates
## 3266                64.0                157.0 0.99688 3.42      0.44
## 1932               146.5                307.5 0.99240 3.24      0.37
## 3410                24.0                111.0 0.99160 3.37      0.43
## 1485                 8.0                134.0 0.99580 3.14      0.50
## 3088                38.0                201.0 0.99500 3.14      0.44
## 1703                 6.0                114.0 0.99550 3.25      0.35
## 2533                 8.0                178.0 0.99502 3.16      0.38
## 1060                48.0                166.0 0.99440 3.30      0.54
## 907                  8.0                 72.0 0.99440 3.08      0.61
## 3219                 8.0                111.0 0.99270 3.00      0.44
## 2594                49.0                147.0 0.99482 3.22      0.38
## 430                 54.0                197.0 0.99780 3.10      0.49
## 785                 47.0                197.0 0.99865 3.12      0.53
## 1145                27.0                132.0 0.99330 3.26      0.44
## 2540                20.0                111.0 0.99228 3.41      0.50
## 3224                52.0                154.0 0.99452 3.04      0.33
## 2218                18.0                129.0 0.99280 3.21      0.38
## 3978                18.0                117.0 0.99116 3.09      0.38
## 1483                57.0                187.0 0.99520 3.62      0.81
## 1920                56.0                189.0 0.99940 3.24      0.66
## 600                 14.0                 55.0 0.99090 3.08      0.68
## 2556                63.0                146.0 0.99066 3.13      0.72
## 3597                55.0                108.0 0.98939 3.00      0.52
## 352                 32.0                138.0 0.99200 3.03      0.30
## 2176                48.5                212.0 0.99860 3.14      0.49
## 160                 43.0                119.0 0.98940 3.36      0.33
## 3462               105.0                179.0 0.99189 3.04      0.59
## 331                 34.0                101.0 0.99120 3.36      0.60
## 3071                30.0                110.0 0.99164 3.08      0.59
## 1633                28.0                111.0 0.99260 3.41      0.58
## 828                 27.0                139.0 0.99055 3.28      0.48
## 877                 57.0                119.0 0.98980 3.28      0.36
## 821                 24.0                 85.0 0.98965 3.41      0.61
## 775                 28.0                124.0 0.99700 3.20      0.46
## 1606                31.0                113.0 0.99030 3.37      0.42
##      alcohol
## 3266     8.0
## 1932    11.0
## 3410    11.0
## 1485     9.1
## 3088    10.1
## 1703     8.6
## 2533     9.4
## 1060     8.7
## 907     10.3
## 3219     9.9
## 2594    10.8
## 430      9.3
## 785      9.0
## 1145    10.5
## 2540    10.0
## 3224    12.0
## 2218    11.5
## 3978    13.0
## 1483    10.4
## 1920     9.0
## 600     11.5
## 2556    11.2
## 3597    11.9
## 352     11.9
## 2176     9.1
## 160     12.1
## 3462    11.3
## 331     12.5
## 3071    12.5
## 1633    12.2
## 828     12.5
## 877     12.7
## 821     12.4
## 775     10.4
## 1606    12.9
hc.complete=hclust(dist(new_df), method="complete")
plot(hc.complete,main="Complete Linkage", xlab="index of selected row", sub="", cex =.9)

hc.average=hclust(dist(new_df), method="average") 
plot(hc.average , main="Average Linkage", xlab="index of selected row", sub="", cex =.9)

hc.single=hclust(dist(new_df), method="single")
plot(hc.single , main="Single Linkage", xlab="index of selected row", sub="", cex =.9)

Decision Tree

Data Preparation

data = read.csv("wine_white.csv")
head(data)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.0             0.27        0.36           20.7     0.045
## 2           6.3             0.30        0.34            1.6     0.049
## 3           8.1             0.28        0.40            6.9     0.050
## 4           7.2             0.23        0.32            8.5     0.058
## 5           7.2             0.23        0.32            8.5     0.058
## 6           8.1             0.28        0.40            6.9     0.050
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  45                  170  1.0010 3.00      0.45     8.8
## 2                  14                  132  0.9940 3.30      0.49     9.5
## 3                  30                   97  0.9951 3.26      0.44    10.1
## 4                  47                  186  0.9956 3.19      0.40     9.9
## 5                  47                  186  0.9956 3.19      0.40     9.9
## 6                  30                   97  0.9951 3.26      0.44    10.1
##   quality
## 1       6
## 2       6
## 3       6
## 4       6
## 5       6
## 6       6
#summary(data)
set.seed(210)
table(data$quality)
## 
##    3    4    5    6    7    8    9 
##   20  163 1457 2198  880  175    5
### we can see there the data is imbalanced.
### to deal with this data imbalance we will use oversampling 
data <- data[sample(1:nrow(data),nrow(data),prob = ifelse(data$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(data$quality)
## 
##    3    4    5    6    7    8    9 
##  126  149 1399 2133  888  172   31

split into 3 clusters:

if quality < 5 -> low;

if 6 <= quality < 7 -> median;

if quality >= 7 -> high

library(dplyr)
library(caret)
library(tree)
data = data %>% mutate(quality_degree = case_when(quality >= 7 ~ 'High', quality >= 5 ~ 'Mid', TRUE ~ 'Low'))
data=data[, c(1:11, 13)]
data$quality_degree=as.factor(data$quality_degree)
summary(data)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.200   Min.   :0.080    Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.210    1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.260    Median :0.3100   Median : 5.000  
##  Mean   : 6.867   Mean   :0.277    Mean   :0.3325   Mean   : 6.282  
##  3rd Qu.: 7.300   3rd Qu.:0.320    3rd Qu.:0.3800   3rd Qu.: 9.700  
##  Max.   :14.200   Max.   :1.100    Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.00      Min.   : 10.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:107.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04519   Mean   : 36.05      Mean   :139.3       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:168.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9874   Min.   :2.740   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.189   Mean   :0.4919   Mean   :10.54  
##  3rd Qu.:0.9960   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.810   Max.   :1.0800   Max.   :14.20  
##  quality_degree
##  High:1091     
##  Low : 275     
##  Mid :3532     
##                
##                
## 
# splid the data into training and testing 
set.seed(13)
#dim(data)
training_index=sample(1:dim(data)[1], dim(data)[1]*0.8)
testing_index=-training_index

training_set=data[training_index, ]
testing_set=data[testing_index, ]
summary(training_set)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.200   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3100   Median : 4.900  
##  Mean   : 6.871   Mean   :0.2758   Mean   :0.3316   Mean   : 6.292  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3800   3rd Qu.: 9.800  
##  Max.   :10.700   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.0       Min.   : 10.0       
##  1st Qu.:0.03600   1st Qu.: 23.0       1st Qu.:107.0       
##  Median :0.04300   Median : 34.0       Median :134.0       
##  Mean   :0.04544   Mean   : 36.1       Mean   :139.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.0       3rd Qu.:168.0       
##  Max.   :0.34600   Max.   :289.0       Max.   :440.0       
##     density             pH          sulphates        alcohol     
##  Min.   :0.9874   Min.   :2.740   Min.   :0.250   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.080   1st Qu.:0.410   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.470   Median :10.40  
##  Mean   :0.9940   Mean   :3.189   Mean   :0.492   Mean   :10.52  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.550   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.810   Max.   :1.080   Max.   :14.20  
##  quality_degree
##  High: 845     
##  Low : 224     
##  Mid :2849     
##                
##                
## 

Build Decision Tree Model

set.seed(13)
#dim(data)
training_index=sample(1:dim(data)[1], dim(data)[1]*0.8)
testing_index=-training_index

training_set=data[training_index, ]
testing_set=data[testing_index, ]
summary(training_set)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 4.200   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3100   Median : 4.900  
##  Mean   : 6.871   Mean   :0.2758   Mean   :0.3316   Mean   : 6.292  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3800   3rd Qu.: 9.800  
##  Max.   :10.700   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.0       Min.   : 10.0       
##  1st Qu.:0.03600   1st Qu.: 23.0       1st Qu.:107.0       
##  Median :0.04300   Median : 34.0       Median :134.0       
##  Mean   :0.04544   Mean   : 36.1       Mean   :139.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.0       3rd Qu.:168.0       
##  Max.   :0.34600   Max.   :289.0       Max.   :440.0       
##     density             pH          sulphates        alcohol     
##  Min.   :0.9874   Min.   :2.740   Min.   :0.250   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.080   1st Qu.:0.410   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.470   Median :10.40  
##  Mean   :0.9940   Mean   :3.189   Mean   :0.492   Mean   :10.52  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.550   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.810   Max.   :1.080   Max.   :14.20  
##  quality_degree
##  High: 845     
##  Low : 224     
##  Mid :2849     
##                
##                
## 
tree.wine = tree(quality_degree~., data=training_set)
summary(tree.wine)
## 
## Classification tree:
## tree(formula = quality_degree ~ ., data = training_set)
## Variables actually used in tree construction:
## [1] "alcohol"              "volatile.acidity"     "free.sulfur.dioxide" 
## [4] "residual.sugar"       "total.sulfur.dioxide"
## Number of terminal nodes:  10 
## Residual mean deviance:  1.153 = 4507 / 3908 
## Misclassification error rate: 0.2422 = 949 / 3918
# the training error rate is 0.2422
# there are 10 terminal nodes 

Plot Tree

plot(tree.wine)
text(tree.wine, pretty=0)

Decission Tree Evaluation Using Test Set

set.seed(213)
pred = predict(tree.wine, testing_set, type = "class")
caret::confusionMatrix(pred, testing_set$quality_degree)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low Mid
##       High   44   0  30
##       Low     1  10   6
##       Mid   201  41 647
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7153          
##                  95% CI : (0.6859, 0.7434)
##     No Information Rate : 0.6969          
##     P-Value [Acc > NIR] : 0.1115          
##                                           
##                   Kappa : 0.1817          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Low Class: Mid
## Sensitivity              0.17886    0.19608     0.9473
## Specificity              0.95913    0.99247     0.1852
## Pos Pred Value           0.59459    0.58824     0.7278
## Neg Pred Value           0.77704    0.95742     0.6044
## Prevalence               0.25102    0.05204     0.6969
## Detection Rate           0.04490    0.01020     0.6602
## Detection Prevalence     0.07551    0.01735     0.9071
## Balanced Accuracy        0.56899    0.59427     0.5662
# accuracy: 0.752 

Purning Tree

set.seed(11)
cross_validation_tree =cv.tree(tree.wine ,FUN=prune.misclass )
names(cross_validation_tree)
## [1] "size"   "dev"    "k"      "method"
cross_validation_tree # dev corresponds to the cross-validation error rate in this instance
## $size
## [1] 10  8  7  5  1
## 
## $dev
## [1]  986  986  980  987 1069
## 
## $k
## [1]  -Inf  0.00  6.00  7.50 24.75
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cross_validation_tree$size,cross_validation_tree$dev ,type="b")

# We now apply the prune.misclass() function in order to prune the tree to prune.obtain the five-node tree.
# print out the best tree node size
cross_validation_tree$size[which.min(cross_validation_tree$dev)]
## [1] 7
prune.wine =prune.misclass (tree.wine, best =cross_validation_tree$size[which.min(cross_validation_tree$dev)])
plot(prune.wine)
text(prune.wine,pretty =0)

Purned Tree Evaluation

purn.pred=predict(prune.wine, testing_set, type="class")

# Calculate the accuracy: 
caret::confusionMatrix(purn.pred, testing_set$quality_degree)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low Mid
##       High   44   0  30
##       Low     1   8   4
##       Mid   201  43 649
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7153          
##                  95% CI : (0.6859, 0.7434)
##     No Information Rate : 0.6969          
##     P-Value [Acc > NIR] : 0.1115          
##                                           
##                   Kappa : 0.1755          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Low Class: Mid
## Sensitivity              0.17886   0.156863     0.9502
## Specificity              0.95913   0.994618     0.1785
## Pos Pred Value           0.59459   0.615385     0.7268
## Neg Pred Value           0.77704   0.955533     0.6092
## Prevalence               0.25102   0.052041     0.6969
## Detection Rate           0.04490   0.008163     0.6622
## Detection Prevalence     0.07551   0.013265     0.9112
## Balanced Accuracy        0.56899   0.575740     0.5643
# Accoring to the results, the purned tree have same accuracy as the original tree
# using the purining method, we could make this tree less complicated while keeping the accuracy 

Random Froest

set.seed(213)
rf1=randomForest(quality_degree ~.,data=training_set, mtry=4, importance=TRUE, ntree=500)
summary(rf1)
##                 Length Class  Mode     
## call                6  -none- call     
## type                1  -none- character
## predicted        3918  factor numeric  
## err.rate         2000  -none- numeric  
## confusion          12  -none- numeric  
## votes           11754  matrix numeric  
## oob.times        3918  -none- numeric  
## classes             3  -none- character
## importance         55  -none- numeric  
## importanceSD       44  -none- numeric  
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             14  -none- list     
## y                3918  factor numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call

Random Forest Evaluation Using Test Set

predict_rf = predict(rf1, newdata = testing_set)
# calculate the accuracy and error rate 
accuracy=mean(testing_set$quality_degree == predict_rf)
error=mean(testing_set$quality_degree != predict_rf)
# print out the accuracy and error rate 
cat('accuracy is ', accuracy, "\n")
## accuracy is  0.9122449
cat('error rate is ', error)
## error rate is  0.0877551

Bagging

set.seed(210)
rf2=randomForest(quality_degree ~.,data=training_set, mtry=11, importance=TRUE, ntree=500)
summary(rf2)
##                 Length Class  Mode     
## call                6  -none- call     
## type                1  -none- character
## predicted        3918  factor numeric  
## err.rate         2000  -none- numeric  
## confusion          12  -none- numeric  
## votes           11754  matrix numeric  
## oob.times        3918  -none- numeric  
## classes             3  -none- character
## importance         55  -none- numeric  
## importanceSD       44  -none- numeric  
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             14  -none- list     
## y                3918  factor numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call
predict_rf2 = predict(rf2, newdata = testing_set)
# calculate the accuracy and error rate 
accuracy=mean(testing_set$quality_degree == predict_rf2)
error=mean(testing_set$quality_degree != predict_rf2)
# print out the accuracy and error rate 
cat('accuracy is ', accuracy, "\n")
## accuracy is  0.9183673
cat('error rate is ', error)
## error rate is  0.08163265

Find the importance of variables

importance(rf2)
##                           High       Low      Mid MeanDecreaseAccuracy
## fixed.acidity         59.13778  53.06202 61.35774             77.51511
## volatile.acidity      99.96748  73.21482 90.86874            126.71534
## citric.acid           58.36603  39.33573 55.18533             73.83397
## residual.sugar        61.51853  50.24703 55.27263             79.26795
## chlorides             72.55040  42.69326 59.12336             85.90796
## free.sulfur.dioxide   67.86080  91.70688 87.12803            115.48873
## total.sulfur.dioxide  58.63962  48.07500 56.18040             82.17094
## density               48.00448  31.67580 44.49665             62.53439
## pH                    76.99637  44.20504 71.49911             90.68676
## sulphates             78.03483  45.83338 67.42694             89.77290
## alcohol              149.65674 107.43122 94.02303            195.73010
##                      MeanDecreaseGini
## fixed.acidity                129.2981
## volatile.acidity             132.9597
## citric.acid                  112.0393
## residual.sugar               132.4474
## chlorides                    124.3271
## free.sulfur.dioxide          195.8889
## total.sulfur.dioxide         130.6064
## density                      119.4986
## pH                           135.2123
## sulphates                    141.0688
## alcohol                      295.8018
varImpPlot(rf2)

GAM model

library(dplyr)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16.1

Read the data

data = read.csv("wine_white.csv")
# step 1: resample the data 
set.seed(213)
table(data$quality)
## 
##    3    4    5    6    7    8    9 
##   20  163 1457 2198  880  175    5
data <- data[sample(1:nrow(data),nrow(data),prob = ifelse(data$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(data$quality)
## 
##    3    4    5    6    7    8    9 
##  106  140 1412 2183  850  176   31
# step 2: split the categorical labels into 2 group: Above_Average and Below_Average
data = data %>% mutate(quality_degree = case_when(quality >= 6 ~ 'Above_Average', TRUE ~ 'Below_Average'))
data=data[, c(1:11, 13)]
data$quality_degree=as.factor(data$quality_degree)

GAM method 1: GAM model with 3 most important variables

df_gam=data[, c("alcohol", "volatile.acidity","free.sulfur.dioxide", "quality_degree")]
summary(df_gam)
##     alcohol      volatile.acidity free.sulfur.dioxide       quality_degree
##  Min.   : 8.00   Min.   :0.0800   Min.   :  2.00      Above_Average:3240  
##  1st Qu.: 9.50   1st Qu.:0.2100   1st Qu.: 24.00      Below_Average:1658  
##  Median :10.40   Median :0.2600   Median : 34.00                          
##  Mean   :10.53   Mean   :0.2772   Mean   : 35.91                          
##  3rd Qu.:11.40   3rd Qu.:0.3200   3rd Qu.: 46.00                          
##  Max.   :14.20   Max.   :1.0050   Max.   :289.00

Split the data into training set and testing set

set.seed(13)
#dim(data)
training_index=sample(1:dim(df_gam)[1], dim(df_gam)[1]*0.8)
testing_index=-training_index

training_set=df_gam[training_index, ]
testing_set=df_gam[testing_index, ]
# summary(training_set)

Find best degree of freedom

df_parameter=data.frame(model=c(paste("gam", 2:6)), test.accuracy=NA)

for (mydegree in 2:6){
gam1=gam(I(quality_degree=='Above_Average') ~ s(alcohol,mydegree) + s(volatile.acidity, mydegree)+ s(free.sulfur.dioxide,mydegree), family=binomial, data=training_set)
# Evaluate the model   
# Prediction
pred_gam <- predict(gam1, newdata=testing_set, type="response")
# Confusion Matrix 
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
df_parameter$test.accuracy[mydegree-1]=acc_gam
}
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7397959
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7142857
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7071429
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.705102
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7061224
df_parameter
##   model test.accuracy
## 1 gam 2     0.7397959
## 2 gam 3     0.7142857
## 3 gam 4     0.7071429
## 4 gam 5     0.7051020
## 5 gam 6     0.7061224

Build GAM model with 3 most important variables

gam1=gam(I(quality_degree=='Above_Average') ~ s(alcohol,2) + s(volatile.acidity, 2)+ s(free.sulfur.dioxide,2), family=binomial, data=training_set)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam1)
## 
## Call: gam(formula = I(quality_degree == "Above_Average") ~ s(alcohol, 
##     2) + s(volatile.acidity, 2) + s(free.sulfur.dioxide, 2), 
##     family = binomial, data = training_set)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7577 -0.9496  0.4643  0.8240  2.0818 
## 
## (Dispersion Parameter for binomial family taken to be 1)
## 
##     Null Deviance: 5022.995 on 3917 degrees of freedom
## Residual Deviance: 4009.184 on 3911 degrees of freedom
## AIC: 4023.183 
## 
## Number of Local Scoring Iterations: 18 
## 
## Anova for Parametric Effects
##                             Df Sum Sq Mean Sq F value    Pr(>F)    
## s(alcohol, 2)                1  425.9  425.88 442.642 < 2.2e-16 ***
## s(volatile.acidity, 2)       1  186.3  186.33 193.664 < 2.2e-16 ***
## s(free.sulfur.dioxide, 2)    1   28.7   28.73  29.857 4.942e-08 ***
## Residuals                 3911 3762.9    0.96                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                           Npar Df Npar Chisq    P(Chi)    
## (Intercept)                                               
## s(alcohol, 2)                   1      5.917   0.01499 *  
## s(volatile.acidity, 2)          1     31.114 2.433e-08 ***
## s(free.sulfur.dioxide, 2)       1    119.203 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(3,3))
plot(gam1, se=T, col='red')

evaluate the model

# Prediction
pred_gam <- predict(gam1, newdata=testing_set, type="response")

# Confusion Matrix 
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
##                Predicted
## Actual          FALSE TRUE
##   Above_Average    87  567
##   Below_Average   168  158
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
## Accuracy:  0.7397959

Method 2: Build GAM model with 9 predictors

data = read.csv("wine_white.csv")
heatmap(cor(data))

# step 1: resample the data 
set.seed(213)
table(data$quality)
## 
##    3    4    5    6    7    8    9 
##   20  163 1457 2198  880  175    5
data <- data[sample(1:nrow(data),nrow(data),prob = ifelse(data$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(data$quality)
## 
##    3    4    5    6    7    8    9 
##  106  140 1412 2183  850  176   31
# step 2: split the categorical labels into 2 group: Above_Average and Below_Average
data = data %>% mutate(quality_degree = case_when(quality >= 6 ~ 'Above_Average', TRUE ~ 'Below_Average'))
data=data[, c(1:11, 13)]
data$quality_degree=as.factor(data$quality_degree)

splid the data into training set and testing set

df_gam=data 
set.seed(13)
#dim(data)
training_index=sample(1:dim(df_gam)[1], dim(df_gam)[1]*0.8)
testing_index=-training_index

training_set=df_gam[training_index, ]
testing_set=df_gam[testing_index, ]
# summary(training_set)

GAM model with 9 features

df_parameter=data.frame(model=c(paste("gam", 2:6)), test.accuracy=NA)

for (mydegree in 2:6){
  gam=gam(I(quality_degree=='Above_Average') ~ s(fixed.acidity, mydegree)+s(volatile.acidity, mydegree)+s(citric.acid, mydegree) + s(residual.sugar,mydegree)+s(chlorides,mydegree) +s( free.sulfur.dioxide, mydegree) +s(pH, mydegree) + s(sulphates,mydegree)+s(alcohol,mydegree), family=binomial, data=training_set)
# Evaluate the model   
# Prediction
pred_gam <- predict(gam, newdata=testing_set, type="response")
# Confusion Matrix 
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
df_parameter$test.accuracy[mydegree-1]=acc_gam
}
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7346939
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7244898
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7040816
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7081633
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy:  0.7010204
df_parameter
##   model test.accuracy
## 1 gam 2     0.7346939
## 2 gam 3     0.7244898
## 3 gam 4     0.7040816
## 4 gam 5     0.7081633
## 5 gam 6     0.7010204
gam2=gam(I(quality_degree=='Above_Average') ~ s(fixed.acidity, 2)+s(volatile.acidity, 2)+s(citric.acid, 2) + s(residual.sugar,2)+s(chlorides,2) +s(free.sulfur.dioxide, 2) +s(pH, 2) + s(sulphates,2)+s(alcohol,2), family=binomial, data=training_set)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam2)
## 
## Call: gam(formula = I(quality_degree == "Above_Average") ~ s(fixed.acidity, 
##     2) + s(volatile.acidity, 2) + s(citric.acid, 2) + s(residual.sugar, 
##     2) + s(chlorides, 2) + s(free.sulfur.dioxide, 2) + s(pH, 
##     2) + s(sulphates, 2) + s(alcohol, 2), family = binomial, 
##     data = training_set)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9711 -0.8913  0.4395  0.8019  2.2180 
## 
## (Dispersion Parameter for binomial family taken to be 1)
## 
##     Null Deviance: 5022.995 on 3917 degrees of freedom
## Residual Deviance: 3877.554 on 3899 degrees of freedom
## AIC: 3915.553 
## 
## Number of Local Scoring Iterations: 15 
## 
## Anova for Parametric Effects
##                             Df Sum Sq Mean Sq  F value    Pr(>F)    
## s(fixed.acidity, 2)          1   12.3   12.34  12.7384 0.0003625 ***
## s(volatile.acidity, 2)       1   85.7   85.67  88.4664 < 2.2e-16 ***
## s(citric.acid, 2)            1    0.8    0.77   0.7994 0.3713353    
## s(residual.sugar, 2)         1   15.9   15.87  16.3836 5.273e-05 ***
## s(chlorides, 2)              1   32.5   32.46  33.5182 7.616e-09 ***
## s(free.sulfur.dioxide, 2)    1    0.0    0.00   0.0002 0.9891921    
## s(pH, 2)                     1    1.7    1.73   1.7898 0.1810324    
## s(sulphates, 2)              1   10.4   10.36  10.6975 0.0010821 ** 
## s(alcohol, 2)                1  456.4  456.37 471.2703 < 2.2e-16 ***
## Residuals                 3899 3775.7    0.97                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                           Npar Df Npar Chisq    P(Chi)    
## (Intercept)                                               
## s(fixed.acidity, 2)             1      8.839  0.002948 ** 
## s(volatile.acidity, 2)          1     35.342 2.766e-09 ***
## s(citric.acid, 2)               1      8.761  0.003076 ** 
## s(residual.sugar, 2)            1      2.136  0.143889    
## s(chlorides, 2)                 1     21.572 3.408e-06 ***
## s(free.sulfur.dioxide, 2)       1     89.185 < 2.2e-16 ***
## s(pH, 2)                        1      1.970  0.160380    
## s(sulphates, 2)                 1      3.705  0.054238 .  
## s(alcohol, 2)                   1      1.551  0.212991    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(3,4))
plot(gam2, se=T, col='red')

evaluate the model

# Prediction
pred_gam <- predict(gam2, newdata=testing_set, type="response")

# Confusion Matrix 
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
##                Predicted
## Actual          FALSE TRUE
##   Above_Average    82  572
##   Below_Average   178  148
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
## Accuracy:  0.7346939

Lasso Regression for Predictor Selection

set.seed(1234)
wine1 <- read.csv("wine_white.csv")
wine.df <- as.data.frame(wine1)
wine1 = wine1 %>% mutate(quality = as.factor(quality))
table(wine1$quality)
## 
##    3    4    5    6    7    8    9 
##   20  163 1457 2198  880  175    5
### we can see there the data is imbalanced.
### to deal with this data imbalance we will use oversampling 
wine1 <- wine1[sample(1:nrow(wine1),nrow(wine1),prob = ifelse(wine$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(wine1$quality)
## 
##    3    4    5    6    7    8    9 
##  121  139 1417 2155  840  197   29
train_index = sample(1: nrow(wine1),0.8*nrow(wine1))
train = wine1[train_index,]
test = wine1[-train_index,]
###Lasso Regression to do predictor selection 

x = model.matrix(quality ~. , wine1)
y = wine1$quality
y.test = y[-train_index]
x.test = x[-train_index,]
y.train = y[train_index]
x.train = x[train_index,]
lasso.mod=glmnet(x.train,y.train,alpha=1,family = "multinomial",type.measure = "class")
plot(lasso.mod)

set.seed(2111)
cv.out=cv.glmnet(x.train , y.train ,alpha=1 ,family = "multinomial",type.measure = "class")
plot(cv.out)

bestlam =cv.out$lambda.min
lasso.pred=predict (lasso.mod ,s= bestlam ,newx=x.test)
mean((lasso.pred -y.test)^2)
## Warning in Ops.factor(lasso.pred, y.test): '-' not meaningful for factors
## [1] NA
out=glmnet(x , y,alpha=1,family = "multinomial",type.measure = "class")
lasso.coef=predict (out ,type="coefficients",s=bestlam)
lasso.coef
## $`3`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)          -2.038391e+02
## (Intercept)           .           
## fixed.acidity         5.598210e-01
## volatile.acidity      1.544162e+00
## citric.acid          -1.522891e-01
## residual.sugar       -1.069421e-01
## chlorides             .           
## free.sulfur.dioxide   1.521637e-02
## total.sulfur.dioxide  8.130362e-03
## density               7.998550e+01
## pH                    1.017839e-01
## sulphates            -5.320139e+00
## alcohol              -2.527539e-02
## 
## $`4`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)          -2.866734e+02
## (Intercept)           .           
## fixed.acidity        -3.558959e-01
## volatile.acidity      4.859123e+00
## citric.acid           2.868932e-01
## residual.sugar       -2.010750e-01
## chlorides             3.441476e+00
## free.sulfur.dioxide  -5.490122e-02
## total.sulfur.dioxide -1.506094e-03
## density               1.886115e+02
## pH                   -3.441663e+00
## sulphates            -1.771338e+00
## alcohol              -6.230058e-01
## 
## $`5`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)          -1.127979e+02
## (Intercept)           .           
## fixed.acidity        -2.379319e-01
## volatile.acidity      7.425178e-01
## citric.acid           1.255643e-01
## residual.sugar       -6.332946e-02
## chlorides             3.119120e-02
## free.sulfur.dioxide  -8.886790e-03
## total.sulfur.dioxide  3.560308e-03
## density               1.437077e+01
## pH                   -2.868947e+00
## sulphates            -1.683779e+00
## alcohol              -8.317170e-01
## 
## $`6`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)          -1.057067e+02
## (Intercept)           .           
## fixed.acidity        -2.998124e-01
## volatile.acidity     -4.502309e+00
## citric.acid          -4.225328e-01
## residual.sugar        .           
## chlorides            -1.419198e+00
## free.sulfur.dioxide  -1.405314e-03
## total.sulfur.dioxide  1.065751e-03
## density               .           
## pH                   -2.897014e+00
## sulphates             1.816807e-01
## alcohol               1.304029e-02
## 
## $`7`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)           3.380622e+02
## (Intercept)           .           
## fixed.acidity         1.549179e-01
## volatile.acidity     -7.172379e+00
## citric.acid          -1.595888e+00
## residual.sugar        2.133992e-01
## chlorides            -1.158916e+01
## free.sulfur.dioxide   .           
## total.sulfur.dioxide -3.274419e-04
## density              -4.615320e+02
## pH                    .           
## sulphates             2.333521e+00
## alcohol               7.265307e-02
## 
## $`8`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)           2.723865e+02
## (Intercept)           .           
## fixed.acidity         .           
## volatile.acidity     -7.156854e+00
## citric.acid           .           
## residual.sugar        2.407980e-01
## chlorides             5.484369e+00
## free.sulfur.dioxide   9.851367e-03
## total.sulfur.dioxide  .           
## density              -4.010822e+02
## pH                    4.629667e-02
## sulphates             .           
## alcohol               4.676187e-01
## 
## $`9`
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                                  1
## (Intercept)            98.56832794
## (Intercept)             .         
## fixed.acidity           2.46338102
## volatile.acidity        .         
## citric.acid             0.40586922
## residual.sugar          0.37889867
## chlorides            -289.67223963
## free.sulfur.dioxide     0.06032348
## total.sulfur.dioxide   -0.03565604
## density              -287.74588347
## pH                     16.75761392
## sulphates               2.87349833
## alcohol                 .
######for each class 
##### knn
list.k = c(1:20)
res.acc = c()
for (val in list.k) {
  knn.pred=knn(x.train,x.test,y.train ,k=val)
  acc = mean(knn.pred == y.test)
  res.acc[val] = acc
}
plot(list.k,res.acc)

res.acc[1]
## [1] 0.7765306
###according to the graph, we should use k = 1 as the paremeter for the knn method which accuracy is 0.787755

SVM

#split data
library(caret)
set.seed(22202)
train=createDataPartition(wine$quality, p = 0.7, list = FALSE) 
test =-train
winetrain = wine[train,]
winetest =wine[test,]
X.train = wine[train,1:11]
Y.train =wine[train,12]
X.test = wine[test,1:11] 
Y.test = wine[test,12]
#svm
#kernal=radial
library(e1071)
svm_model= svm(quality ~ . , data = winetrain,kernel='radial')
svm_result = predict(svm_model, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result, winetest$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0   2   3   0   0   0   0
##          5   1  30 257 104   6   1   0
##          6   5  16 176 536 196  43   0
##          7   0   0   1  19  62   8   1
##          8   0   0   0   0   0   0   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5842          
##                  95% CI : (0.5585, 0.6096)
##     No Information Rate : 0.4492          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3184          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000 0.041667   0.5881   0.8134  0.23485  0.00000
## Specificity           1.00000 0.997886   0.8621   0.4604  0.97589  1.00000
## Pos Pred Value            NaN 0.400000   0.6441   0.5514  0.68132      NaN
## Neg Pred Value        0.99591 0.968536   0.8315   0.7515  0.85320  0.96455
## Prevalence            0.00409 0.032720   0.2979   0.4492  0.17996  0.03545
## Detection Rate        0.00000 0.001363   0.1752   0.3654  0.04226  0.00000
## Detection Prevalence  0.00000 0.003408   0.2720   0.6626  0.06203  0.00000
## Balanced Accuracy     0.50000 0.519776   0.7251   0.6369  0.60537  0.50000
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          1.0000000
## Pos Pred Value             NaN
## Neg Pred Value       0.9993183
## Prevalence           0.0006817
## Detection Rate       0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy    0.5000000
#kernal=sigmoid
svm_model1= svm(quality ~ . , data = winetrain,kernel='sigmoid')
svm_result1 = predict(svm_model1, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result1, winetest$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0   2  14   6   0   0   0
##          5   1  24 210 180  27   3   0
##          6   4  18 181 353 145  27   0
##          7   0   4  32 119  88  21   1
##          8   1   0   0   1   4   1   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4458          
##                  95% CI : (0.4202, 0.4717)
##     No Information Rate : 0.4492          
##     P-Value [Acc > NIR] : 0.6133          
##                                           
##                   Kappa : 0.152           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7
## Sensitivity           0.00000 0.041667   0.4805   0.5357  0.33333
## Specificity           1.00000 0.985906   0.7718   0.5359  0.85287
## Pos Pred Value            NaN 0.090909   0.4719   0.4849  0.33208
## Neg Pred Value        0.99591 0.968166   0.7779   0.5859  0.85358
## Prevalence            0.00409 0.032720   0.2979   0.4492  0.17996
## Detection Rate        0.00000 0.001363   0.1431   0.2406  0.05999
## Detection Prevalence  0.00000 0.014997   0.3033   0.4963  0.18064
## Balanced Accuracy     0.50000 0.513786   0.6262   0.5358  0.59310
##                       Class: 8  Class: 9
## Sensitivity          0.0192308 0.0000000
## Specificity          0.9957597 1.0000000
## Pos Pred Value       0.1428571       NaN
## Neg Pred Value       0.9650685 0.9993183
## Prevalence           0.0354465 0.0006817
## Detection Rate       0.0006817 0.0000000
## Detection Prevalence 0.0047716 0.0000000
## Balanced Accuracy    0.5074952 0.5000000
#kernal=poly
svm_model2= svm(quality ~ . , data = winetrain,kernel='polynomial')
svm_result2 = predict(svm_model2, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result2, winetest$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0  10   7   2   0   0   0
##          5   1  19 155  63   7   0   0
##          6   5  19 273 584 219  47   0
##          7   0   0   2   9  38   5   1
##          8   0   0   0   0   0   0   0
##          9   0   0   0   1   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5365          
##                  95% CI : (0.5106, 0.5622)
##     No Information Rate : 0.4492          
##     P-Value [Acc > NIR] : 1.277e-11       
##                                           
##                   Kappa : 0.2168          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000 0.208333   0.3547   0.8862  0.14394  0.00000
## Specificity           1.00000 0.993658   0.9126   0.3032  0.98587  1.00000
## Pos Pred Value            NaN 0.526316   0.6327   0.5092  0.69091      NaN
## Neg Pred Value        0.99591 0.973757   0.7692   0.7656  0.83994  0.96455
## Prevalence            0.00409 0.032720   0.2979   0.4492  0.17996  0.03545
## Detection Rate        0.00000 0.006817   0.1057   0.3981  0.02590  0.00000
## Detection Prevalence  0.00000 0.012952   0.1670   0.7819  0.03749  0.00000
## Balanced Accuracy     0.50000 0.600995   0.6337   0.5947  0.56490  0.50000
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          0.9993179
## Pos Pred Value       0.0000000
## Neg Pred Value       0.9993179
## Prevalence           0.0006817
## Detection Rate       0.0000000
## Detection Prevalence 0.0006817
## Balanced Accuracy    0.4996589
#kernal=linear
svm_model3= svm(quality ~ . , data = winetrain,kernel='linear')
svm_result3 = predict(svm_model3, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result3, winetest$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0   0   0   0   0   0   0
##          5   1  29 229 117  11   3   0
##          6   5  19 208 542 253  49   1
##          7   0   0   0   0   0   0   0
##          8   0   0   0   0   0   0   0
##          9   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5256          
##                  95% CI : (0.4996, 0.5514)
##     No Information Rate : 0.4492          
##     P-Value [Acc > NIR] : 2.704e-09       
##                                           
##                   Kappa : 0.1972          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000  0.00000   0.5240   0.8225     0.00  0.00000
## Specificity           1.00000  1.00000   0.8437   0.3379     1.00  1.00000
## Pos Pred Value            NaN      NaN   0.5872   0.5032      NaN      NaN
## Neg Pred Value        0.99591  0.96728   0.8069   0.7000     0.82  0.96455
## Prevalence            0.00409  0.03272   0.2979   0.4492     0.18  0.03545
## Detection Rate        0.00000  0.00000   0.1561   0.3695     0.00  0.00000
## Detection Prevalence  0.00000  0.00000   0.2658   0.7342     0.00  0.00000
## Balanced Accuracy     0.50000  0.50000   0.6839   0.5802     0.50  0.50000
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          1.0000000
## Pos Pred Value             NaN
## Neg Pred Value       0.9993183
## Prevalence           0.0006817
## Detection Rate       0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy    0.5000000

Multinomial Logidtic Regression Model

#multinomial logistic regression model
library(nnet)
str(winetrain)
## 'data.frame':    3431 obs. of  12 variables:
##  $ fixed.acidity       : num  7 7.2 7.2 8.1 6.2 7 6.3 8.1 8.6 6.6 ...
##  $ volatile.acidity    : num  0.27 0.23 0.23 0.28 0.32 0.27 0.3 0.22 0.23 0.17 ...
##  $ citric.acid         : num  0.36 0.32 0.32 0.4 0.16 0.36 0.34 0.43 0.4 0.38 ...
##  $ residual.sugar      : num  20.7 8.5 8.5 6.9 7 20.7 1.6 1.5 4.2 1.5 ...
##  $ chlorides           : num  0.045 0.058 0.058 0.05 0.045 0.045 0.049 0.044 0.035 0.032 ...
##  $ free.sulfur.dioxide : num  45 47 47 30 30 45 14 28 17 28 ...
##  $ total.sulfur.dioxide: num  170 186 186 97 136 170 132 129 109 112 ...
##  $ density             : num  1.001 0.996 0.996 0.995 0.995 ...
##  $ pH                  : num  3 3.19 3.19 3.26 3.18 3 3.3 3.22 3.14 3.25 ...
##  $ sulphates           : num  0.45 0.4 0.4 0.44 0.47 0.45 0.49 0.45 0.53 0.55 ...
##  $ alcohol             : num  8.8 9.9 9.9 10.1 9.6 8.8 9.5 11 9.7 11.4 ...
##  $ quality             : Factor w/ 7 levels "3","4","5","6",..: 4 4 4 4 4 4 4 4 3 5 ...
multinomModel=multinom(quality ~ ., data = winetrain)
## # weights:  91 (72 variable)
## initial  value 6676.417721 
## iter  10 value 4504.508732
## iter  20 value 4328.247226
## iter  30 value 4181.580560
## iter  40 value 3772.920846
## iter  50 value 3738.364479
## iter  60 value 3730.892578
## iter  70 value 3728.356930
## iter  80 value 3727.062078
## iter  90 value 3725.710133
## iter 100 value 3724.778463
## final  value 3724.778463 
## stopped after 100 iterations
summary(multinomModel)
## Call:
## multinom(formula = quality ~ ., data = winetrain)
## 
## Coefficients:
##   (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4  -8.5644472    -0.7575642         1.020941    1.119383    0.005782688
## 5  -0.4297904    -1.1978646        -2.965519    1.382383    0.058605511
## 6  -3.9405082    -1.3281131        -8.643098    1.685138    0.112602600
## 7  28.4597424    -1.1830105       -10.549078    0.767677    0.152865511
## 8  10.3949788    -1.1383010       -10.334086    1.249730    0.205061803
## 9 -38.3144078     1.4503718       -13.830056    1.822825    0.283671177
##   chlorides free.sulfur.dioxide total.sulfur.dioxide   density        pH
## 4 -10.82038        -0.092042259          0.013560405  19.81470 -1.066912
## 5 -11.70640        -0.055031198          0.016964964  24.71136 -3.246355
## 6 -11.69285        -0.047367193          0.014928454  20.52416 -2.994665
## 7 -31.53422        -0.038735252          0.012195765 -22.58238 -1.919272
## 8 -25.55608        -0.019816012          0.011299295 -14.47209 -1.182091
## 9 -47.73206         0.007218965         -0.001483365 -40.51331 11.797128
##    sulphates    alcohol
## 4 -0.6692671 0.11387371
## 5 -1.7145426 0.02841294
## 6 -0.3896408 0.87998174
## 7  0.9662256 1.41158756
## 8  0.6485456 1.81007572
## 9 -3.9754228 3.00644027
## 
## Std. Errors:
##   (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4   0.1261435     0.2492844       0.68353816  0.70178743     0.06521474
## 5   0.5111369     0.2385980       0.44570858  0.36419400     0.06110678
## 6   0.4297549     0.2387147       0.41812528  0.33162194     0.06107124
## 7   0.5495636     0.2435921       0.54927574  0.45840138     0.06190275
## 8   0.1081364     0.2584410       0.93185816  0.75779797     0.06392709
## 9   0.1666752     0.4944409       0.03573703  0.07221853     0.09643979
##    chlorides free.sulfur.dioxide total.sulfur.dioxide   density        pH
## 4 0.06515046          0.01621825          0.008808593 0.1256969 0.4512610
## 5 0.96828679          0.01345030          0.008397584 0.5024334 0.4421693
## 6 0.95972836          0.01329874          0.008386623 0.4223883 0.4149276
## 7 0.05037647          0.01352164          0.008498867 0.5417329 0.4476471
## 8 0.01210080          0.01370128          0.009070917 0.1070391 0.4593520
## 9 0.01644169          0.03830606          0.022564035 0.1693621 1.0846948
##   sulphates   alcohol
## 4 0.7960803 0.2133457
## 5 0.4028178 0.1976410
## 6 0.3295562 0.1956629
## 7 0.3814666 0.1980105
## 8 0.6152199 0.2103405
## 9 0.2679084 0.3743991
## 
## Residual Deviance: 7449.557 
## AIC: 7593.557
#predict
predict_class=predict(multinomModel,winetest)
#Misclassification error
cm = table(predict_class, winetest$quality) 
confusionMatrix(predict_class, winetest$quality)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   3   4   5   6   7   8   9
##          3   0   0   0   0   0   0   0
##          4   0   4   3   2   0   0   0
##          5   2  25 231 113  14   6   0
##          6   4  19 200 498 188  37   0
##          7   0   0   3  45  62   9   1
##          8   0   0   0   0   0   0   0
##          9   0   0   0   1   0   0   0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5419         
##                  95% CI : (0.516, 0.5677)
##     No Information Rate : 0.4492         
##     P-Value [Acc > NIR] : 6.785e-13      
##                                          
##                   Kappa : 0.2564         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000 0.083333   0.5286   0.7557  0.23485  0.00000
## Specificity           1.00000 0.996476   0.8447   0.4455  0.95179  1.00000
## Pos Pred Value            NaN 0.444444   0.5908   0.5264  0.51667      NaN
## Neg Pred Value        0.99591 0.969822   0.8086   0.6910  0.85004  0.96455
## Prevalence            0.00409 0.032720   0.2979   0.4492  0.17996  0.03545
## Detection Rate        0.00000 0.002727   0.1575   0.3395  0.04226  0.00000
## Detection Prevalence  0.00000 0.006135   0.2665   0.6449  0.08180  0.00000
## Balanced Accuracy     0.50000 0.539905   0.6866   0.6006  0.59332  0.50000
##                       Class: 9
## Sensitivity          0.0000000
## Specificity          0.9993179
## Pos Pred Value       0.0000000
## Neg Pred Value       0.9993179
## Prevalence           0.0006817
## Detection Rate       0.0000000
## Detection Prevalence 0.0006817
## Balanced Accuracy    0.4996589
#multinomial logistic regression model using mlogit()
library(mlogit)
## Loading required package: Formula
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: lmtest
H <- mlogit.data(winetrain, choice = "quality", shape = "wide")
mlogitmodel <- mlogit(quality ~ 1 | fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol, data = H)

mlogitmodel
## 
## Call:
## mlogit(formula = quality ~ 1 | fixed.acidity + volatile.acidity +     citric.acid + residual.sugar + chlorides + free.sulfur.dioxide +     total.sulfur.dioxide + density + pH + sulphates + alcohol,     data = H, method = "nr")
## 
## Coefficients:
##          4:(intercept)           5:(intercept)           6:(intercept)  
##            -2.8590e+02             -8.4188e+01             -9.5809e+00  
##          7:(intercept)           8:(intercept)           9:(intercept)  
##             5.4995e+02              7.8993e+02              3.1936e+03  
##        4:fixed.acidity         5:fixed.acidity         6:fixed.acidity  
##            -1.1086e+00             -1.3841e+00             -1.4564e+00  
##        7:fixed.acidity         8:fixed.acidity         9:fixed.acidity  
##            -9.0957e-01             -6.7954e-01              6.0766e+00  
##     4:volatile.acidity      5:volatile.acidity      6:volatile.acidity  
##             2.5623e-01             -3.5165e+00             -9.1329e+00  
##     7:volatile.acidity      8:volatile.acidity      9:volatile.acidity  
##            -1.0969e+01             -1.0634e+01             -2.2988e+01  
##          4:citric.acid           5:citric.acid           6:citric.acid  
##             5.3941e-01              9.8451e-01              1.3139e+00  
##          7:citric.acid           8:citric.acid           9:citric.acid  
##             4.6225e-01              9.5835e-01              2.0964e+00  
##       4:residual.sugar        5:residual.sugar        6:residual.sugar  
##            -1.3451e-01              1.8101e-02              1.0231e-01  
##       7:residual.sugar        8:residual.sugar        9:residual.sugar  
##             3.3452e-01              4.9144e-01              1.4790e+00  
##            4:chlorides             5:chlorides             6:chlorides  
##            -9.8593e+00             -1.0064e+01             -9.5982e+00  
##            7:chlorides             8:chlorides             9:chlorides  
##            -2.4082e+01             -1.5080e+01             -2.9459e+02  
##  4:free.sulfur.dioxide   5:free.sulfur.dioxide   6:free.sulfur.dioxide  
##            -8.6527e-02             -5.3931e-02             -4.6736e-02  
##  7:free.sulfur.dioxide   8:free.sulfur.dioxide   9:free.sulfur.dioxide  
##            -4.1125e-02             -2.3049e-02             -3.8749e-02  
## 4:total.sulfur.dioxide  5:total.sulfur.dioxide  6:total.sulfur.dioxide  
##             1.1786e-02              1.5538e-02              1.3735e-02  
## 7:total.sulfur.dioxide  8:total.sulfur.dioxide  9:total.sulfur.dioxide  
##             1.3421e-02              1.3333e-02              4.1507e-02  
##              4:density               5:density               6:density  
##             3.0779e+02              1.1458e+02              3.1067e+01  
##              7:density               8:density               9:density  
##            -5.4652e+02             -8.0023e+02             -3.3686e+03  
##                   4:pH                    5:pH                    6:pH  
##            -3.3613e+00             -4.5219e+00             -3.9704e+00  
##                   7:pH                    8:pH                    9:pH  
##            -1.1829e+00              3.8343e-01              3.2454e+01  
##            4:sulphates             5:sulphates             6:sulphates  
##            -9.4671e-01             -1.5138e+00             -6.8293e-02  
##            7:sulphates             8:sulphates             9:sulphates  
##             1.9959e+00              2.0382e+00             -2.3890e+00  
##              4:alcohol               5:alcohol               6:alcohol  
##             3.2991e-01              6.3759e-02              8.2709e-01  
##              7:alcohol               8:alcohol               9:alcohol  
##             7.7186e-01              8.9899e-01              2.8399e-01
summary(mlogitmodel)
## 
## Call:
## mlogit(formula = quality ~ 1 | fixed.acidity + volatile.acidity + 
##     citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + 
##     total.sulfur.dioxide + density + pH + sulphates + alcohol, 
##     data = H, method = "nr")
## 
## Frequencies of alternatives:
##         3         4         5         6         7         8         9 
## 0.0040804 0.0335179 0.2972894 0.4485573 0.1795395 0.0358496 0.0011658 
## 
## nr method
## 12 iterations, 0h:0m:4s 
## g'(-H)^-1g = 0.000107 
## successive function values within tolerance limits 
## 
## Coefficients :
##                           Estimate  Std. Error z-value  Pr(>|z|)    
## 4:(intercept)          -2.8590e+02  5.0714e+02 -0.5637 0.5729275    
## 5:(intercept)          -8.4188e+01  5.0122e+02 -0.1680 0.8666090    
## 6:(intercept)          -9.5809e+00  5.0103e+02 -0.0191 0.9847435    
## 7:(intercept)           5.4995e+02  5.1117e+02  1.0759 0.2819874    
## 8:(intercept)           7.8993e+02  5.5591e+02  1.4210 0.1553261    
## 9:(intercept)           3.1936e+03  1.9163e+03  1.6665 0.0956044 .  
## 4:fixed.acidity        -1.1086e+00  5.5301e-01 -2.0047 0.0449958 *  
## 5:fixed.acidity        -1.3841e+00  5.3692e-01 -2.5779 0.0099403 ** 
## 6:fixed.acidity        -1.4564e+00  5.3686e-01 -2.7129 0.0066704 ** 
## 7:fixed.acidity        -9.0957e-01  5.4489e-01 -1.6693 0.0950613 .  
## 8:fixed.acidity        -6.7954e-01  5.8350e-01 -1.1646 0.2441844    
## 9:fixed.acidity         6.0766e+00  2.5999e+00  2.3373 0.0194262 *  
## 4:volatile.acidity      2.5623e-01  2.2463e+00  0.1141 0.9091846    
## 5:volatile.acidity     -3.5165e+00  2.1945e+00 -1.6024 0.1090576    
## 6:volatile.acidity     -9.1329e+00  2.2216e+00 -4.1110 3.939e-05 ***
## 7:volatile.acidity     -1.0969e+01  2.2839e+00 -4.8027 1.565e-06 ***
## 8:volatile.acidity     -1.0634e+01  2.5123e+00 -4.2326 2.310e-05 ***
## 9:volatile.acidity     -2.2988e+01  1.0971e+01 -2.0953 0.0361449 *  
## 4:citric.acid           5.3941e-01  2.4573e+00  0.2195 0.8262508    
## 5:citric.acid           9.8451e-01  2.3298e+00  0.4226 0.6726023    
## 6:citric.acid           1.3139e+00  2.3343e+00  0.5629 0.5735092    
## 7:citric.acid           4.6225e-01  2.3777e+00  0.1944 0.8458575    
## 8:citric.acid           9.5835e-01  2.5127e+00  0.3814 0.7029060    
## 9:citric.acid           2.0964e+00  3.8345e+00  0.5467 0.5845619    
## 4:residual.sugar       -1.3451e-01  1.9598e-01 -0.6863 0.4925030    
## 5:residual.sugar        1.8101e-02  1.9067e-01  0.0949 0.9243664    
## 6:residual.sugar        1.0231e-01  1.9067e-01  0.5366 0.5915607    
## 7:residual.sugar        3.3452e-01  1.9449e-01  1.7200 0.0854345 .  
## 8:residual.sugar        4.9144e-01  2.1134e-01  2.3254 0.0200528 *  
## 9:residual.sugar        1.4790e+00  7.7338e-01  1.9123 0.0558322 .  
## 4:chlorides            -9.8593e+00  7.3695e+00 -1.3379 0.1809420    
## 5:chlorides            -1.0064e+01  6.2972e+00 -1.5982 0.1099888    
## 6:chlorides            -9.5982e+00  6.3392e+00 -1.5141 0.1299966    
## 7:chlorides            -2.4082e+01  7.8937e+00 -3.0508 0.0022824 ** 
## 8:chlorides            -1.5080e+01  1.0542e+01 -1.4304 0.1525880    
## 9:chlorides            -2.9459e+02  1.3261e+02 -2.2215 0.0263155 *  
## 4:free.sulfur.dioxide  -8.6527e-02  1.7053e-02 -5.0739 3.898e-07 ***
## 5:free.sulfur.dioxide  -5.3931e-02  1.4413e-02 -3.7419 0.0001826 ***
## 6:free.sulfur.dioxide  -4.6736e-02  1.4278e-02 -3.2734 0.0010626 ** 
## 7:free.sulfur.dioxide  -4.1125e-02  1.4529e-02 -2.8305 0.0046472 ** 
## 8:free.sulfur.dioxide  -2.3049e-02  1.4793e-02 -1.5580 0.1192277    
## 9:free.sulfur.dioxide  -3.8749e-02  5.1578e-02 -0.7513 0.4524933    
## 4:total.sulfur.dioxide  1.1786e-02  1.0212e-02  1.1541 0.2484437    
## 5:total.sulfur.dioxide  1.5538e-02  9.8256e-03  1.5814 0.1137927    
## 6:total.sulfur.dioxide  1.3735e-02  9.8243e-03  1.3980 0.1621021    
## 7:total.sulfur.dioxide  1.3421e-02  9.9491e-03  1.3490 0.1773421    
## 8:total.sulfur.dioxide  1.3333e-02  1.0535e-02  1.2655 0.2056754    
## 9:total.sulfur.dioxide  4.1507e-02  2.4287e-02  1.7090 0.0874419 .  
## 4:density               3.0779e+02  5.1585e+02  0.5967 0.5507324    
## 5:density               1.1458e+02  5.0969e+02  0.2248 0.8221260    
## 6:density               3.1067e+01  5.0950e+02  0.0610 0.9513787    
## 7:density              -5.4652e+02  5.1973e+02 -1.0515 0.2930098    
## 8:density              -8.0023e+02  5.6491e+02 -1.4166 0.1566110    
## 9:density              -3.3686e+03  1.9628e+03 -1.7162 0.0861305 .  
## 4:pH                   -3.3613e+00  2.9866e+00 -1.1254 0.2604074    
## 5:pH                   -4.5219e+00  2.8735e+00 -1.5737 0.1155645    
## 6:pH                   -3.9704e+00  2.8709e+00 -1.3830 0.1666636    
## 7:pH                   -1.1829e+00  2.9029e+00 -0.4075 0.6836633    
## 8:pH                    3.8343e-01  3.0575e+00  0.1254 0.9002001    
## 9:pH                    3.2454e+01  1.2807e+01  2.5341 0.0112738 *  
## 4:sulphates            -9.4671e-01  2.9211e+00 -0.3241 0.7458715    
## 5:sulphates            -1.5138e+00  2.7710e+00 -0.5463 0.5848641    
## 6:sulphates            -6.8293e-02  2.7635e+00 -0.0247 0.9802844    
## 7:sulphates             1.9959e+00  2.7825e+00  0.7173 0.4731891    
## 8:sulphates             2.0382e+00  2.8668e+00  0.7110 0.4771022    
## 9:sulphates            -2.3890e+00  7.9361e+00 -0.3010 0.7633903    
## 4:alcohol               3.2991e-01  7.0319e-01  0.4692 0.6389531    
## 5:alcohol               6.3759e-02  6.9200e-01  0.0921 0.9265888    
## 6:alcohol               8.2709e-01  6.9183e-01  1.1955 0.2318886    
## 7:alcohol               7.7186e-01  7.0244e-01  1.0988 0.2718459    
## 8:alcohol               8.9899e-01  7.4923e-01  1.1999 0.2301810    
## 9:alcohol               2.8399e-01  2.0220e+00  0.1404 0.8883046    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -3699.1
## McFadden R^2:  0.16555 
## Likelihood ratio test : chisq = 1467.8 (p.value = < 2.22e-16)
##Look at percent correct
correct <- mlogitmodel$probabilities
binarycorrect <- colnames(correct)[apply(correct,1,which.max)] 
table(winetrain$quality,binarycorrect)
##    binarycorrect
##        4    5    6    7    8    9
##   3    0    7    6    0    1    0
##   4    6   64   44    1    0    0
##   5    3  537  475    5    0    0
##   6    2  296 1139  102    0    0
##   7    0   28  428  160    0    0
##   8    0    7   76   40    0    0
##   9    0    0    1    2    0    1